#Install packages:
#Source: https://trinkerrstuff.wordpress.com/my-r-packages/qdap/

#if (!require("pacman")) install.packages("pacman")
#pacman::p_load(sentimentr, dplyr, magrittr)
#install.packages("devtools")
#install_github("trinker/qdapDictionaries")
#install_github("trinker/qdapRegex")
#install_github("trinker/qdapTools")
#install_github("trinker/qdap")
#install.packages("quanteda")
#install.packages("sentimentr")
#install.packages("ndjson")
#install.packages("NLP")
#install.packages("dplyr")
#install.packages("tidyr")
#install.packages("tm")
#install.packages("corpus")
#install.packages("syuzhet")
#install.packages("plotly")
#install.packages("wordcloud")
library(devtools)
## Loading required package: usethis
library(tm)
## Loading required package: NLP
library(qdap)
## Loading required package: qdapDictionaries
## Loading required package: qdapRegex
## Loading required package: qdapTools
## Loading required package: RColorBrewer
## 
## Attaching package: 'qdap'
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, as.TermDocumentMatrix
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following objects are masked from 'package:base':
## 
##     Filter, proportions
library(sentimentr)
## Registered S3 methods overwritten by 'textclean':
##   method           from
##   print.check_text qdap
##   print.sub_holder qdap
library(ndjson)
## 
## Attaching package: 'ndjson'
## The following object is masked from 'package:qdapRegex':
## 
##     validate
library(corpus)
library(syuzhet)
## 
## Attaching package: 'syuzhet'
## The following object is masked from 'package:sentimentr':
## 
##     get_sentences
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:qdapTools':
## 
##     id
## The following object is masked from 'package:qdapRegex':
## 
##     explain
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(quanteda)
## Package version: 3.1.0
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
## 
##     stopwords
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:qdapRegex':
## 
##     %+%
## The following object is masked from 'package:NLP':
## 
##     annotate
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:sentimentr':
## 
##     highlight
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(wordcloud)

#a good package, also takes into account  negative words and amplifiers
#see: http://www.inside-r.org/packages/cran/qdap/docs/polarity
#getwd()
#setwd("C:/Ryerson University - Capstone project/Module 2/EIEEE - Large dataset/Combined")
#Read in original data set May 2020
data_set_may <- read.csv("corona_tweets_59 May 2020", header = T, sep = ",")
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input
#take a sample of 1,000, set seed to replicate results across several analysis of methods:
set.seed(1000)
rawData <- data_set_may[sample(nrow(data_set_may), size = 1000), ]
#write.csv(rawData,'rawData.csv')

str(rawData)
## 'data.frame':    1000 obs. of  35 variables:
##  $ coordinates               : chr  "" "" "" "" ...
##  $ created_at                : chr  "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
##  $ hashtags                  : chr  "" "" "" "" ...
##  $ media                     : chr  "" "" "" "" ...
##  $ urls                      : chr  "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
##  $ favorite_count            : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ id                        : num  1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
##  $ in_reply_to_screen_name   : chr  "" "" "" "" ...
##  $ in_reply_to_status_id     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ in_reply_to_user_id       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ lang                      : chr  "en" "en" "en" "en" ...
##  $ place                     : chr  "" "" "" "" ...
##  $ possibly_sensitive        : chr  "" "" "" "false" ...
##  $ quote_id                  : num  NA NA NA NA 1.26e+18 ...
##  $ retweet_count             : int  25 338 441 0 0 12022 4 11 1 0 ...
##  $ retweet_id                : num  1.26e+18 1.26e+18 1.26e+18 NA NA ...
##  $ retweet_screen_name       : chr  "business" "Suewilson91" "BreitbartNews" "" ...
##  $ source                    : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##  $ text                      : chr  "Many Americans have proven diligent in staying home to limit the spread of Covid-19. But their acceptance of so"| __truncated__ "Matt Hancock accused of being a 'liar' and told to resign after claiming he 'protected' care homes from the sta"| __truncated__ "Secretary of State @MikePompeo told Breitbart News that President Donald Trump is â\200œcommittedâ\200\235 to h"| __truncated__ "Officials warn Chinese hackers are targeting U.S. coronavirus research https://t.co/7tnGhf85MS via @nbcnews" ...
##  $ tweet_url                 : chr  "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
##  $ user_created_at           : chr  "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
##  $ user_id                   : num  2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
##  $ user_default_profile_image: chr  "false" "false" "false" "false" ...
##  $ user_description          : chr  "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
##  $ user_favourites_count     : int  92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
##  $ user_followers_count      : int  1469 45 65 263 426 97 109 2151 375 12607 ...
##  $ user_friends_count        : int  2526 229 228 1960 267 240 274 4846 227 12722 ...
##  $ user_listed_count         : int  73 0 1 1 4 1 0 15 13 106 ...
##  $ user_location             : chr  "" "New Forest" "" "United States" ...
##  $ user_name                 : chr  "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
##  $ user_screen_name          : chr  "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
##  $ user_statuses_count       : int  35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
##  $ user_time_zone            : logi  NA NA NA NA NA NA ...
##  $ user_urls                 : chr  "http://lynnmargherita.com" "" "" "" ...
##  $ user_verified             : chr  "false" "false" "false" "false" ...
#create a corpus:
importdocs = corpus(rawData, text_field = 'text')
#preprocessing of data
importdocs <- gsub("'", "", importdocs)  # remove apostrophes
importdocs <- gsub("[[:punct:]]", " ", importdocs)  # replace punctuation with space
importdocs <- gsub("[[:cntrl:]]", " ", importdocs)  # replace control characters with space
importdocs <- gsub("^[[:space:]]+", "", importdocs) # remove whitespace at beginning of documents
importdocs <- gsub("[[:space:]]+$", "", importdocs) # remove whitespace at end of documents
importdocs <- tolower(importdocs)


# CLEANING TWEETS

importdocs=gsub("&amp", "", importdocs)

importdocs = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", importdocs)
importdocs = gsub("@\\w+", "", importdocs)

importdocs = gsub("[[:digit:]]", "", importdocs)
importdocs = gsub("http\\w+", "", importdocs)
importdocs = gsub("[ \t]{2,}", "", importdocs)
importdocs = gsub("^\\s+|\\s+$", "", importdocs)

importdocs <- iconv(importdocs, "UTF-8", "ASCII", sub="")

str(importdocs)
##  'corpus' Named chr [1:1000] "many americans have proven diligent in staying home to limit the spread of covidbut their acceptance of social "| __truncated__ ...
##  - attr(*, "names")= chr [1:1000] "text1" "text2" "text3" "text4" ...
##  - attr(*, "docvars")='data.frame':  1000 obs. of  37 variables:
##   ..$ docname_                  : chr [1:1000] "text1" "text2" "text3" "text4" ...
##   ..$ docid_                    : Factor w/ 1000 levels "text1","text2",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ segid_                    : int [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
##   ..$ coordinates               : chr [1:1000] "" "" "" "" ...
##   ..$ created_at                : chr [1:1000] "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
##   ..$ hashtags                  : chr [1:1000] "" "" "" "" ...
##   ..$ media                     : chr [1:1000] "" "" "" "" ...
##   ..$ urls                      : chr [1:1000] "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
##   ..$ favorite_count            : int [1:1000] 0 0 0 0 0 0 0 0 1 1 ...
##   ..$ id                        : num [1:1000] 1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
##   ..$ in_reply_to_screen_name   : chr [1:1000] "" "" "" "" ...
##   ..$ in_reply_to_status_id     : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##   ..$ in_reply_to_user_id       : num [1:1000] NA NA NA NA NA NA NA NA NA NA ...
##   ..$ lang                      : chr [1:1000] "en" "en" "en" "en" ...
##   ..$ place                     : chr [1:1000] "" "" "" "" ...
##   ..$ possibly_sensitive        : chr [1:1000] "" "" "" "false" ...
##   ..$ quote_id                  : num [1:1000] NA NA NA NA 1.26e+18 ...
##   ..$ retweet_count             : int [1:1000] 25 338 441 0 0 12022 4 11 1 0 ...
##   ..$ retweet_id                : num [1:1000] 1.26e+18 1.26e+18 1.26e+18 NA NA ...
##   ..$ retweet_screen_name       : chr [1:1000] "business" "Suewilson91" "BreitbartNews" "" ...
##   ..$ source                    : chr [1:1000] "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##   ..$ tweet_url                 : chr [1:1000] "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
##   ..$ user_created_at           : chr [1:1000] "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
##   ..$ user_id                   : num [1:1000] 2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
##   ..$ user_default_profile_image: chr [1:1000] "false" "false" "false" "false" ...
##   ..$ user_description          : chr [1:1000] "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
##   ..$ user_favourites_count     : int [1:1000] 92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
##   ..$ user_followers_count      : int [1:1000] 1469 45 65 263 426 97 109 2151 375 12607 ...
##   ..$ user_friends_count        : int [1:1000] 2526 229 228 1960 267 240 274 4846 227 12722 ...
##   ..$ user_listed_count         : int [1:1000] 73 0 1 1 4 1 0 15 13 106 ...
##   ..$ user_location             : chr [1:1000] "" "New Forest" "" "United States" ...
##   ..$ user_name                 : chr [1:1000] "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
##   ..$ user_screen_name          : chr [1:1000] "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
##   ..$ user_statuses_count       : int [1:1000] 35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
##   ..$ user_time_zone            : logi [1:1000] NA NA NA NA NA NA ...
##   ..$ user_urls                 : chr [1:1000] "http://lynnmargherita.com" "" "" "" ...
##   ..$ user_verified             : chr [1:1000] "false" "false" "false" "false" ...
##  - attr(*, "meta")=List of 3
##   ..$ system:List of 6
##   .. ..$ package-version:Classes 'package_version', 'numeric_version'  hidden list of 1
##   .. .. ..$ : int [1:3] 3 1 0
##   .. ..$ r-version      :Classes 'R_system_version', 'package_version', 'numeric_version'  hidden list of 1
##   .. .. ..$ : int [1:3] 4 1 1
##   .. ..$ system         : Named chr [1:3] "Windows" "x86-64" "jbloos"
##   .. .. ..- attr(*, "names")= chr [1:3] "sysname" "machine" "user"
##   .. ..$ directory      : chr "C:/Ryerson University - Capstone project/Module 2/EIEEE - Large dataset/Combined"
##   .. ..$ created        : Date[1:1], format: "2021-11-08"
##   .. ..$ source         : chr "data.frame"
##   ..$ object:List of 2
##   .. ..$ unit   : chr "documents"
##   .. ..$ summary:List of 2
##   .. .. ..$ hash: chr(0) 
##   .. .. ..$ data: NULL
##   ..$ user  : list()
mycorpus <- get_sentences(importdocs)
mysentiment <- sentiment(mycorpus)
mysentiment
##       element_id sentence_id word_count    sentiment
##    1:          1           1         25  0.310000000
##    2:          2           1         32  0.008838835
##    3:          3           1         38 -0.129777137
##    4:          4           1         12 -0.216506351
##    5:          5           1         19  0.149120227
##   ---                                               
##  996:        996           1         38  0.129777137
##  997:        997           1         15  0.232379001
##  998:        998           1          7  0.377964473
##  999:        999           1         41 -0.312347524
## 1000:       1000           1         14  0.000000000
# run overall score, result overall neutral to perhaps moderate positive
summary(mysentiment$sentiment)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.37620 -0.13606  0.00000  0.01764  0.18415  0.92095
#results expressed in histogram

qplot(mysentiment$sentiment,   geom="histogram",binwidth=0.1,main="Review Sentiment Histogram")

#source: https://www.programmingr.com/sentiment-analysis/

#returns the individual words along with their polarity strength and counts.
t = extract_sentiment_terms(mycorpus) 
attributes(t)$count
##             words polarity  n
##    1:        care     1.00 30
##    2:      please     1.00 13
##    3:  understand     1.00  8
##    4:         top     1.00  7
##    5:       truth     1.00  5
##   ---                        
## 7829:  would have    -1.05  3
## 7830:  could have    -1.05  3
## 7831: should have    -1.05  2
## 7832:    too many    -2.00  6
## 7833:    too much    -2.00  1
#show positive and negative word use:
head(t,20)
##     element_id sentence_id                 negative                   positive
##  1:          1           1                    limit proven,diligent,acceptance
##  2:          2           1      accused,liar,resign      protected,care,league
##  3:          3           1 trump,communist,pandemic                accountable
##  4:          4           1                     warn                           
##  5:          5           1                                           like,good
##  6:          6           1                     deny                           
##  7:          7           1                               obtaining,results,art
##  8:          8           1                      bad                           
##  9:          9           1                                                    
## 10:         10           1                 threaten                           
## 11:         11           1                      cut               work,freedom
## 12:         12           1                                                safe
## 13:         13           1                                                    
## 14:         14           1              death,death           content,measured
## 15:         15           1                                                    
## 16:         16           1                                                    
## 17:         17           1                                  confirmed,positive
## 18:         18           1                                                good
## 19:         19           1                                                care
## 20:         20           1                 ignorant            flatter,flatter
# The emotion() function returns the rate of emotion per sentence. A data frame is returned by this function and of interest to us are the two columns: emotion type and emotion. Emotion indicates the strength of emotion present in the sentence.
emotion(mycorpus[1:2])
##     element_id sentence_id word_count         emotion_type emotion_count
##  1:          1           1         25                anger             0
##  2:          1           1         25              disgust             0
##  3:          1           1         25                 fear             0
##  4:          1           1         25              sadness             0
##  5:          1           1         25                trust             1
##  6:          1           1         25        anger_negated             0
##  7:          1           1         25         anticipation             0
##  8:          1           1         25 anticipation_negated             0
##  9:          1           1         25      disgust_negated             0
## 10:          1           1         25         fear_negated             0
## 11:          1           1         25                  joy             0
## 12:          1           1         25          joy_negated             0
## 13:          1           1         25      sadness_negated             0
## 14:          1           1         25             surprise             0
## 15:          1           1         25     surprise_negated             0
## 16:          1           1         25        trust_negated             0
## 17:          2           1         32                anger             2
## 18:          2           1         32              disgust             2
## 19:          2           1         32                 fear             2
## 20:          2           1         32              sadness             1
## 21:          2           1         32                trust             1
## 22:          2           1         32        anger_negated             0
## 23:          2           1         32         anticipation             0
## 24:          2           1         32 anticipation_negated             0
## 25:          2           1         32      disgust_negated             0
## 26:          2           1         32         fear_negated             0
## 27:          2           1         32                  joy             0
## 28:          2           1         32          joy_negated             0
## 29:          2           1         32      sadness_negated             0
## 30:          2           1         32             surprise             0
## 31:          2           1         32     surprise_negated             0
## 32:          2           1         32        trust_negated             0
##     element_id sentence_id word_count         emotion_type emotion_count
##     emotion
##  1: 0.00000
##  2: 0.00000
##  3: 0.00000
##  4: 0.00000
##  5: 0.04000
##  6: 0.00000
##  7: 0.00000
##  8: 0.00000
##  9: 0.00000
## 10: 0.00000
## 11: 0.00000
## 12: 0.00000
## 13: 0.00000
## 14: 0.00000
## 15: 0.00000
## 16: 0.00000
## 17: 0.06250
## 18: 0.06250
## 19: 0.06250
## 20: 0.03125
## 21: 0.03125
## 22: 0.00000
## 23: 0.00000
## 24: 0.00000
## 25: 0.00000
## 26: 0.00000
## 27: 0.00000
## 28: 0.00000
## 29: 0.00000
## 30: 0.00000
## 31: 0.00000
## 32: 0.00000
##     emotion
# graph with emotional valence, what is explanation. Note to self: look up
plot(mysentiment)

#integrate sentiment score into updated dataset
sentimentResultMay2020 <- rawData
sentimentResultMay2020$sentiment_score = mysentiment$sentiment
str(sentimentResultMay2020)
## 'data.frame':    1000 obs. of  36 variables:
##  $ coordinates               : chr  "" "" "" "" ...
##  $ created_at                : chr  "Sat May 16 23:31:16 +0000 2020" "Sat May 16 18:57:19 +0000 2020" "Sun May 17 02:30:46 +0000 2020" "Sat May 16 23:33:45 +0000 2020" ...
##  $ hashtags                  : chr  "" "" "" "" ...
##  $ media                     : chr  "" "" "" "" ...
##  $ urls                      : chr  "" "" "" "https://www.nbcnews.com/now/video/officials-warn-chinese-hackers-are-targeting-u-s-coronavirus-research-83422277503" ...
##  $ favorite_count            : int  0 0 0 0 0 0 0 0 1 1 ...
##  $ id                        : num  1.26e+18 1.26e+18 1.26e+18 1.26e+18 1.26e+18 ...
##  $ in_reply_to_screen_name   : chr  "" "" "" "" ...
##  $ in_reply_to_status_id     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ in_reply_to_user_id       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ lang                      : chr  "en" "en" "en" "en" ...
##  $ place                     : chr  "" "" "" "" ...
##  $ possibly_sensitive        : chr  "" "" "" "false" ...
##  $ quote_id                  : num  NA NA NA NA 1.26e+18 ...
##  $ retweet_count             : int  25 338 441 0 0 12022 4 11 1 0 ...
##  $ retweet_id                : num  1.26e+18 1.26e+18 1.26e+18 NA NA ...
##  $ retweet_screen_name       : chr  "business" "Suewilson91" "BreitbartNews" "" ...
##  $ source                    : chr  "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" "<a href=\"http://www.echofon.com/\" rel=\"nofollow\">Echofon</a>" "<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>" ...
##  $ text                      : chr  "Many Americans have proven diligent in staying home to limit the spread of Covid-19. But their acceptance of so"| __truncated__ "Matt Hancock accused of being a 'liar' and told to resign after claiming he 'protected' care homes from the sta"| __truncated__ "Secretary of State @MikePompeo told Breitbart News that President Donald Trump is â\200œcommittedâ\200\235 to h"| __truncated__ "Officials warn Chinese hackers are targeting U.S. coronavirus research https://t.co/7tnGhf85MS via @nbcnews" ...
##  $ tweet_url                 : chr  "https://twitter.com/lemnosalt/status/1261801422430978048" "https://twitter.com/Hilary72926522/status/1261732478529740807" "https://twitter.com/BillSpears724/status/1261846593453637632" "https://twitter.com/Bet_the_ChE/status/1261802044354113536" ...
##  $ user_created_at           : chr  "Tue Feb 10 00:25:20 +0000 2009" "Sun Dec 01 15:12:16 +0000 2019" "Fri Jan 06 19:30:57 +0000 2017" "Sun Aug 11 19:49:05 +0000 2013" ...
##  $ user_id                   : num  2.05e+07 1.20e+18 8.17e+17 1.66e+09 1.26e+18 ...
##  $ user_default_profile_image: chr  "false" "false" "false" "false" ...
##  $ user_description          : chr  "Groovy chick and media producer. All snark. No bite." "" "" "Just some engineer." ...
##  $ user_favourites_count     : int  92045 19675 1 46635 2788 1371 1230 18960 4 34505 ...
##  $ user_followers_count      : int  1469 45 65 263 426 97 109 2151 375 12607 ...
##  $ user_friends_count        : int  2526 229 228 1960 267 240 274 4846 227 12722 ...
##  $ user_listed_count         : int  73 0 1 1 4 1 0 15 13 106 ...
##  $ user_location             : chr  "" "New Forest" "" "United States" ...
##  $ user_name                 : chr  "Lynn" "Hilary ðŸ’\231" "Bill Spears" "Bet" ...
##  $ user_screen_name          : chr  "lemnosalt" "Hilary72926522" "BillSpears724" "Bet_the_ChE" ...
##  $ user_statuses_count       : int  35678 5272 24796 23697 1028 317 279 84594 14606 252203 ...
##  $ user_time_zone            : logi  NA NA NA NA NA NA ...
##  $ user_urls                 : chr  "http://lynnmargherita.com" "" "" "" ...
##  $ user_verified             : chr  "false" "false" "false" "false" ...
##  $ sentiment_score           : num  0.31 0.00884 -0.12978 -0.21651 0.14912 ...
#identify text for max (positive) sentiment score
max(mysentiment$sentiment)
## [1] 0.9209474
maxSentiment <- sentimentResultMay2020[which.max(sentimentResultMay2020$sentiment_score),]
maxSentiment$text
## [1] "Amsterdam and Milan are both demonstrating how cities can emerge from the #COVID19 crisis stronger and more resilient than before. By embracing innovative approaches to a #GreenRecovery, we can build a more equitable and sustainable future  https://t.co/Gw8Qk4Tbhc"
#identify text for min sentiment score
min(mysentiment$sentiment)
## [1] -1.376195
minSentiment <- sentimentResultMay2020[which.min(sentimentResultMay2020$sentiment_score),]
minSentiment$text
## [1] "Remember Trump's idiotic statement about too much testing showing too many infections?\n\nTrump really thinks this way\n\nHe really doesn't want more testing\n\nhttps://t.co/UJmDE0nvbP"
#write sentiment score to original dataset
write.csv(sentimentResultMay2020,'sentimentResultMay2020.csv')
#Source: https://www.tabvizexplorer.com/sentiment-analysis-using-r-and-twitter/


#score the emotions on each tweet with syuzhet as it breaks emotion into 10 different categories.
# Emotions for each tweet using NRC dictionary
emotions <- get_nrc_sentiment(importdocs)
emo_bar = colSums(emotions)
emo_sum = data.frame(count=emo_bar, emotion=names(emo_bar))
emo_sum$emotion = factor(emo_sum$emotion, levels=emo_sum$emotion[order(emo_sum$count, decreasing = TRUE)])
# visualize results to what type of emotions are dominant in the tweets
# Visualize the emotions from NRC sentiments
library(plotly)
p <- plot_ly(emo_sum, x=~emotion, y=~count, type="bar", color=~emotion) %>%
  layout(xaxis=list(title=""), showlegend=FALSE,
         title="Emotion Type for Covid related hastags (source: IEEE)")
p
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
#Here we see majority of the people are discussing positive about Covid.
# Create comparison word cloud data
wordcloud_tweet = c(
  paste(importdocs[emotions$anger > 0], collapse=" "),
  paste(importdocs[emotions$anticipation > 0], collapse=" "),
  paste(importdocs[emotions$disgust > 0], collapse=" "),
  paste(importdocs[emotions$fear > 0], collapse=" "),
  paste(importdocs[emotions$joy > 0], collapse=" "),
  paste(importdocs[emotions$sadness > 0], collapse=" "),
  paste(importdocs[emotions$surprise > 0], collapse=" "),
  paste(importdocs[emotions$trust > 0], collapse=" ")
)
# create corpus
corpus = Corpus(VectorSource(wordcloud_tweet))
# remove punctuation, convert every word in lower case and remove stop words

corpus = tm_map(corpus, tolower)
## Warning in tm_map.SimpleCorpus(corpus, tolower): transformation drops documents
corpus = tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation drops
## documents
corpus = tm_map(corpus, removeWords, c(stopwords("english")))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c(stopwords("english"))):
## transformation drops documents
corpus = tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
#warning: transformation drops documents
# create document term matrix
tdm = TermDocumentMatrix(corpus)
# convert as matrix
tdm = as.matrix(tdm)
tdmnew <- tdm[nchar(rownames(tdm)) < 11,]
#Graph presents which word contributes to which emotion.

# column name binding
colnames(tdm) = c('anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness', 'surprise', 'trust')
colnames(tdmnew) <- colnames(tdm)
comparison.cloud(tdmnew, random.order=FALSE,
                 colors = c("#00B2FF", "red", "#FF0099", "#6600CC", "green", "orange", "blue", "brown"),
                 title.size=1, max.words=200, scale=c(2.4, 0.4),rot.per=0.4)